Exploratory Visualization

This section is intended to give an overview of the response distributions from our pilot.

Data

Load Worker Responses from Pilot

# read in the anonymized worker responses from the pilot experiment;
# read_csv prints the column specification it inferred (shown below)
responses_df <- read_csv("pilot-anonymous.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   workerId = col_character(),
##   condition = col_character(),
##   batch = col_integer(),
##   counterbalance = col_integer(),
##   numeracy = col_integer(),
##   bet = col_integer(),
##   outcome = col_character(),
##   sdDiff = col_integer(),
##   trial = col_integer(),
##   trialIdx = col_integer(),
##   repaired = col_character()
## )
## See spec(...) for full column specifications.
# rename to convert away from camel case, then patch known data issues
responses_df <- responses_df %>%
  rename(
    ground_truth=groundTruth,
    sd_diff=sdDiff,
    worker_id=workerId,
    start_time=startTime,
    resp_time=respTime,
    trial_dur=trialDur
  ) %>%
  mutate(
    trial_dur = pmax(trial_dur, 0),         # clamp negative trial durations from faulty reconstruction (only one case); pmax is the vectorized idiom for this clamp
    cles = ifelse(cles == 0, 0.25, cles),   # avoid responses equal to zero
    cles = ifelse(cles == 100, 99.75, cles) # avoid responses equal to one-hundred
  ) 

# preview the first rows of the cleaned responses
head(responses_df)
## # A tibble: 6 x 22
##   worker_id condition batch counterbalance totalBonus duration numeracy
##   <chr>     <chr>     <int>          <int>      <dbl>    <dbl>    <int>
## 1 7553db84  HOPs          5              5      0.858     687.        6
## 2 7553db84  HOPs          5              5      0.858     687.        6
## 3 7553db84  HOPs          5              5      0.858     687.        6
## 4 7553db84  HOPs          5              5      0.858     687.        6
## 5 7553db84  HOPs          5              5      0.858     687.        6
## 6 7553db84  HOPs          5              5      0.858     687.        6
## # ... with 15 more variables: bet <int>, bonus <dbl>, cles <dbl>,
## #   ground_truth <dbl>, keep <dbl>, outcome <chr>, pay <dbl>,
## #   resp_time <dbl>, sd_diff <int>, start_time <dbl>, trial <int>,
## #   trial_dur <dbl>, trialIdx <int>, win <dbl>, repaired <chr>

Load Stimuli-Generating Data

# data used to create stimuli; loads conds_df into the global environment
load("./conds_df.Rda")

Response Distributions

CLES Judgments

As we would expect based on the ubiquitous linear log odds representation of probability, CLES judgments tend to be biased toward 50% relative to the ground truth. This is not so much the case for HOPs; however, responses in the HOPs condition are highly variable.

# Histogram of CLES responses per visualization condition, faceted by
# uncertainty level (sd_diff) and ground truth, with the ground truth in red.
for (cond in unique(responses_df$condition)) {
  plt <- responses_df %>% filter(condition == cond) %>%
    ggplot(aes(x=cles)) +
    geom_vline(aes(xintercept=ground_truth*100, linetype="Ground Truth"), color="red") +
    scale_linetype_manual(name="Line", values = c(2,1), guide=guide_legend(override.aes=list(color=c("red")))) +
    geom_histogram(aes(y=..density..), binwidth=5) +
    theme_bw() +
    labs(
      caption=cond,
      x = "CLES Responses",
      y = "Density" # histogram is normalized via y=..density.., so the axis shows density, not frequency
    ) +
    facet_grid(sd_diff ~ ground_truth)
  print(plt)
}

Bet Amounts

Bet amounts seem more sensitive to probability information when sd_diff is high, making uncertainty more visually salient. However, bet amounts are highly variable across the board.

# Histogram of bet amounts per visualization condition, faceted by uncertainty
# level (sd_diff) and ground truth, with the optimal bet marked in red.
for (cond in unique(responses_df$condition)) {
  plt <- responses_df %>% filter(condition == cond) %>%
    ggplot(aes(x=bet)) +
    geom_vline(aes(xintercept=ground_truth*1000, linetype="Optimal Bet"), color="red") +
    scale_linetype_manual(name="Line", values = c(2,1), guide=guide_legend(override.aes=list(color=c("red")))) +
    geom_histogram(aes(y=..density..), binwidth=50) +
    theme_bw() +
    labs(
      caption=cond,
      x = "Bet Amount",
      y = "Density" # histogram is normalized via y=..density.., so the axis shows density, not frequency
    ) +
    facet_grid(sd_diff ~ ground_truth)
  print(plt)
}

CLES Judgments vs Bet Amounts

Under an ideal betting strategy, bet amounts should be 10 times the CLES value perceived by the participant. We can see that for intervals_w_means and means_only—visualizations where the mean is emphasized—bet amounts are too high for CLES responses above 50% and too low for CLES responses below 50%. In other words, bet amount is too sensitive to perceived probability of winning. Contrast this with HOPs, where we see the same pattern to a lesser extent and bet amount looks more like a noisy linear function of the CLES response.

# Scatter each visualization condition's CLES judgments against bet amounts;
# the dashed red line marks the optimal betting strategy (bet = 10 * CLES).
for (viz in unique(responses_df$condition)) {
  viz_responses <- responses_df %>% filter(condition == viz)
  p <- ggplot(viz_responses, aes(x=cles, y=bet)) +
    geom_abline(intercept=0, slope=10, color="red", linetype="dashed") +
    geom_point(alpha=0.3) +
    facet_grid(sd_diff ~ ground_truth) +
    theme_bw() +
    labs(
      caption=viz,
      x = "CLES Judgment",
      y = "Bet Amount"
    )
  print(p)
}

Relationships with Trial Duration

We want to know when, if at all, spending more time on a response results in improved performance.

Trial Duration vs CLES Judgments

Trial duration seems mostly unrelated to CLES judgments except for in the case of HOPs, where responses seem to cluster closer to the ground truth on longer trial durations.

# Scatter trial duration against CLES judgments for each visualization
# condition, with the ground truth marked by a red horizontal line.
for (viz in unique(responses_df$condition)) {
  viz_responses <- responses_df %>% filter(condition == viz)
  p <- ggplot(viz_responses, aes(x=trial_dur, y=cles)) +
    geom_hline(aes(yintercept=ground_truth*100, linetype="Ground Truth"), color="red") +
    scale_linetype_manual(name="Line", values = c(2,1), guide=guide_legend(override.aes=list(color=c("red")))) +
    geom_point(alpha=0.3) +
    facet_grid(sd_diff ~ ground_truth) +
    theme_bw() +
    labs(
      caption=viz,
      x = "Trial Duration (sec)",
      y = "CLES Judgment"
    )
  print(p)
}

Trial Duration vs Bet Amounts

Similar to what we see above with CLES judgments, trial duration seems mostly unrelated to bet amounts except for in the case of HOPs, where responses seem to cluster closer to the optimal bet on longer trial durations.

# Scatter trial duration against bet amounts for each visualization condition,
# with the optimal bet marked by a red horizontal line.
for (viz in unique(responses_df$condition)) {
  viz_responses <- responses_df %>% filter(condition == viz)
  p <- ggplot(viz_responses, aes(x=trial_dur, y=bet)) +
    geom_hline(aes(yintercept=ground_truth*1000, linetype="Optimal Bet"), color="red") +
    scale_linetype_manual(name="Line", values = c(2,1), guide=guide_legend(override.aes=list(color=c("red")))) +
    geom_point(alpha=0.3) +
    facet_grid(sd_diff ~ ground_truth) +
    theme_bw() +
    labs(
      caption=viz,
      x = "Trial Duration (sec)",
      y = "Bet Amount"
    )
  print(p)
}

Responses and Ground Truth vs Heuristic Predictions

Create Heuristic Functions

The following functions describe the CLES responses predicted by possible heuristics for reading the visualizations in our pilot.

# axis range for modeling: the x axis of every stimulus spans 38 to 62,
# giving a span of 24 units used as a baseline by several heuristics below
data_domain <- c(38, 62)
axis_range <- data_domain[2] - data_domain[1]

# Relative mean difference heuristic: scale the observed mean difference by the
# largest absolute mean difference across all stimuli, mapping it onto a CLES
# estimate on a 0-100 scale (50 when the two means are equal).
relative_mean_difference <- function(mean_diff, max_abs_mean_diff) {
  50 - 50 * mean_diff / max_abs_mean_diff
}

# Mean difference relative to axis range: judge the mean difference against the
# span of the x axis. `range` defaults to the global axis_range defined above;
# it is exposed as an argument so the heuristic generalizes to other axes.
mean_difference_vs_axis <- function(mean_diff, range = axis_range) {
  return(50 - 50 * mean_diff / range)
}

# Means first, then uncertainty heuristic: judge the mean difference relative
# to the length of one team's 95% interval. Both teams share the same sd, so
# both intervals have the same length and no averaging is needed.
means_first_then_uncertainty_intervals <- function(mean_diff, sd_team) {
  ci_length <- qnorm(0.975)*sd_team - qnorm(0.025)*sd_team
  50 - 50 * mean_diff / ci_length / 2
}

# Interval overlap heuristic: estimate CLES from how much the two teams' 95%
# intervals overlap, judged relative to the length of one interval.
interval_overlap <- function(mean_diff, sd_team) {
  # both intervals have the same length, so one serves as the baseline
  len <- qnorm(0.975)*sd_team - qnorm(0.025)*sd_team
  # place the two means symmetrically about zero
  mu_A <- - mean_diff / 2
  mu_B <- mean_diff / 2
  # calculation depends on which mean is larger
  if (mu_A > mu_B) {
    # upper bound of the lower dist minus lower bound of the higher dist
    overlap <- (mu_B + len / 2) - (mu_A - len / 2)
    100 - 50 * overlap / len
  } else { # mu_A <= mu_B
    overlap <- (mu_A + len / 2) - (mu_B - len / 2)
    50 * overlap / len
  }
}

# Interval overlap relative to axis range: like interval_overlap(), but the
# overlap is judged against the span of the x axis rather than the interval
# length. `range` defaults to the global axis_range defined above; it is
# exposed as an argument so the heuristic generalizes to other axes.
interval_overlap_vs_axis <- function(mean_diff, sd_team, range = axis_range) {
  interval_length <- qnorm(0.975)*sd_team - qnorm(0.025)*sd_team # both intervals are the same length, so we don't need to take their average
  mean_teamA <- - mean_diff / 2 # relative to center
  mean_teamB <- mean_diff / 2 # relative to center
  # calculation depends on which mean is larger
  if(mean_teamA > mean_teamB) {
    interval_overlap <- (mean_teamB + interval_length / 2) - (mean_teamA - interval_length / 2) # upper bound of lower dist minus lower bound of higher dist
    return(100 - 50 * interval_overlap / range) 
  } else { # mean_teamA <= mean_teamB
    interval_overlap <- (mean_teamA + interval_length / 2) - (mean_teamB - interval_length / 2) # upper bound of lower dist minus lower bound of higher dist
    return( 50 * interval_overlap / range)
  }
}

# Outcome proportion heuristic: the percentage of displayed draws in which
# team A beats team B. `draws` holds differences B - A, so a negative value
# is a win for A.
outcome_proportion <- function(draws) {
  100 * mean(draws < 0)
}

# Means over spread from HOPs heuristic: estimate the mean difference from the
# displayed draws and judge it against the span of one team's outcomes.
means_first_then_uncertainty_hops <- function(draws) {
  # summary statistics of the displayed differences B - A
  avg_diff <- mean(draws)
  diff_span <- max(draws) - min(draws)
  # convert the span of differences back to one team's span
  # (sqrt of half the squared span — presumably assumes the two teams
  # contribute equally to the spread of the differences)
  team_span <- sqrt((diff_span ^ 2) / 2)
  50 - 50 * avg_diff / team_span / 2
}

# need a consistent color scale for these heuristics
heuristics <- as.factor(c("ground_truth", 
                          "rel_mean_diff_est", "mean_diff_vs_axis_est", 
                          "means_first_then_uncertainty_intervals_est", "interval_overlap_est", "interval_overlap_vs_axis_est", 
                          "outcome_proportion_est",  "outcome_proportion_10_est", "means_first_then_uncertainty_hops_est"))
# hColors <- brewer.pal(length(heuristics), "Set1")
hColors <- c("#E31A1C",                         # from ColorBrewer 12-class Paired palette; red is reserved for ground_truth
              "#B2DF8A", "#FDBF6F",
              "#6A3D9A", "#1F78B4", "#33A02C", 
              "#CAB2D6", "#B15928", "#A6CEE3"
              )
# name the colors in the order the heuristics are listed above;
# the previous levels()-based assignment paired colors with the
# ALPHABETICALLY sorted factor levels, scrambling the intended mapping
names(hColors) <- as.character(heuristics)
colScale <- scale_colour_manual(values = hColors)

Create Optimal Betting Functions

These functions define the optimal betting strategy. In short, for any given CLES value, the optimal bet is \(1000 coins * Pr(A > B)\).

# set range of possible bets based on given budget and minimum bet:
# every whole-coin bet from min_bet up to the full budget
budget <- 1000
min_bet <- 1
possible_bets <- seq(from=min_bet, to=budget, by=1)

# Tiered "capital gains tax" on winnings: marginal rates rise by bracket —
# the first 500 coins are taxed at 10%, the next 500 at 20%, and so on up to
# 50% for any winnings above 2000. Returns the after-tax winnings.
tax_winnings <- function(winnings) {
  brackets <- c(seq(0, 2000, by = 500), Inf)
  rates <- seq(0, 0.5, by = 0.1)
  # portion of the winnings falling in each bracket, kept at the after-tax rate
  per_bracket <- diff(c(0, pmin(winnings, brackets)))
  sum(per_bracket * (1 - rates))
}

# set cost of not betting: coins held out of the bet are kept at (1 - loss_rate)
loss_rate <- 0.25

# find the optimal bet based on the expected value of bet amounts given some
# CLES value (p_superiority_A, the perceived probability that team A wins)
optimal_bet <- function(p_superiority_A) {
  # hack to catch p == 0, which would make the payoff odds infinite
  if (p_superiority_A == 0) {
    p_superiority_A <- 0.001
  }
  # expected utility of each possible bet: value of the un-bet budget (reduced
  # by the loss rate) plus the expected after-tax payoff, proportional to risk
  utility <- vapply(
    possible_bets,
    function(bet) {
      (1 - loss_rate)*(budget - bet) + p_superiority_A * tax_winnings(bet / p_superiority_A)
    },
    numeric(1)
  )
  # which.max returns the FIRST index of the maximum, so ties resolve to the
  # lowest optimal bet and a single value is always returned (the previous
  # which(utility == max(utility)) could return several tied bets)
  return(possible_bets[which.max(utility)])
}

Apply Prediction Functions to Data Conditions and Merge With Response Data

This section of code combines the responses and stimuli-generating data into one visualization that we can use to analyze the prevalence of different heuristics.

# calculate the per-draw differences (B - A) needed by the HOPs heuristic functions
draw_differences <- conds_df %>% select(condition, Team, draws) %>% 
  spread(Team, draws) %>% 
  unnest() %>% 
  mutate(
    draws_diff=B - A, 
    A=NULL, 
    B=NULL
  ) %>% 
  group_by(condition) %>% 
  summarise(draws_diff = list(draws_diff[1:50])) # keep the first 50 draws per condition as a list column

# reformat data conditions df: one row per stimulus, with draw differences attached
stimuli_data_df <- conds_df %>% 
  filter(Team %in% "A") %>% # drop duplicate rows for two teams
  left_join(draw_differences, by='condition') %>%
  mutate( # drop unnecessary columns (assigning NULL inside mutate removes a column)
    condition=NULL,
    Team=NULL, 
    draws=NULL,
    draw_n=NULL,
    quantiles=NULL,
    sample_n=NULL
  )

# repeat heuristics data frame for each worker; alignment relies on sorted
# worker ids (assumes each worker has one trial per ground_truth * sd_diff
# combination — TODO confirm against the experiment design)
stimuli_data_df <- stimuli_data_df[rep(seq_len(nrow(stimuli_data_df)), times=length(unique(responses_df$worker_id))),]
stimuli_data_df$worker_id <- sort(rep(unique(responses_df$worker_id), each=(length(unique(responses_df$ground_truth))) * length(unique(responses_df$sd_diff))))

# calculate the baseline of the relative mean difference heuristic
stimuli_data_df$max_abs_mean_diff <- max(abs(stimuli_data_df$mean_diff))
# create dataframe containing heuristic estimates, one estimate per heuristic per stimulus
heuristics_df <- stimuli_data_df %>% rowwise() %>% 
  mutate( # call heuristic functions (rowwise, since several take scalar or list-column inputs)
    ground_truth = odds_of_victory * 100,
    rel_mean_diff_est = relative_mean_difference(mean_diff, max_abs_mean_diff),
    mean_diff_vs_axis_est = mean_difference_vs_axis(mean_diff),
    means_first_then_uncertainty_intervals_est = means_first_then_uncertainty_intervals(mean_diff, sd),
    interval_overlap_est = interval_overlap(mean_diff, sd),
    interval_overlap_vs_axis_est = interval_overlap_vs_axis(mean_diff, sd),
    outcome_proportion_est = outcome_proportion(draws_diff),
    outcome_proportion_10_est = outcome_proportion(draws_diff[1:10]), # outcome proportion with only the first 10 draws
    means_first_then_uncertainty_hops_est = means_first_then_uncertainty_hops(draws_diff)
  ) %>% 
  gather(heuristic, est_cles, ground_truth, rel_mean_diff_est, mean_diff_vs_axis_est, means_first_then_uncertainty_intervals_est, interval_overlap_est, interval_overlap_vs_axis_est, outcome_proportion_est, outcome_proportion_10_est, means_first_then_uncertainty_hops_est) %>% # reshape to long format: one row per heuristic estimate
  rowwise() %>%
  mutate(est_bet = optimal_bet(est_cles / 100)) %>% # apply optimal bet function (if multiple optimal bets, take the lower to avoid error)
  rename(ground_truth = odds_of_victory) %>%
  arrange(worker_id, sd_diff, ground_truth, heuristic) # use same order for both data frames so they can be bound column-wise below

# extend responses df to repeat for each heuristic
combined_df <- responses_df[rep(seq_len(nrow(responses_df)), each=length(unique(heuristics_df$heuristic))),]
combined_df$heuristic <- rep(rep(unique(heuristics_df$heuristic), times=(length(unique(responses_df$ground_truth))) * length(unique(responses_df$sd_diff))), times=length(unique(responses_df$worker_id)))

# merge response data with heuristics data; both frames are sorted on the same
# keys so rows line up positionally (bind_cols does no key matching — see the
# sanity check below)
combined_df <- combined_df %>% 
  arrange(worker_id, sd_diff, ground_truth, heuristic) %>% # use same order for both data frames
  bind_cols(heuristics_df) # hack because merge and join not working
  # left_join(heuristics_df, by=c('worker_id','ground_truth', 'sd_diff','heuristic'))
  # merge(heuristics_df, on=c('worker_id','ground_truth', 'sd_diff','heuristic'), all=TRUE)

# check the binding: bind_cols() matches rows purely by position, so verify
# that the key columns from the two data frames line up row for row; signal a
# real warning (print() would not surface through warnings()/logs)
if (!all(combined_df$worker_id == combined_df$worker_id1) || !all(combined_df$heuristic == combined_df$heuristic1) || 
    !all(combined_df$sd_diff == combined_df$sd_diff1) || !all(combined_df$ground_truth == round(combined_df$ground_truth1, 3))) {
  warning("Something went horribly wrong: response and heuristic rows are misaligned!", call. = FALSE)
}

Plot Responses Against Heuristics

These visualizations allow us to check responses from individual workers against the predictions of the set of heuristics which are relevant to each visualization condition.

CLES Judgments

Through visual inspection of the plots below, I tallied up the apparent strategy for each worker in each level of sd_diff. Since visualization condition is a between subjects manipulation, no individual worker is contributing more than two strategy codes.

In the HOPs condition, workers seem to be using an outcome_proportion heuristic for only the first 10 draws about half of the time. The other half of the time, I cannot distinguish their strategy (a.k.a., ambiguous strategy) or they are using a means_first heuristic where they estimate the mean difference from the draws to inform their reliability judgment and then compare that to the average span of draws for each of the two teams. Only one worker seemed to actually be counting the proportion of all draws shown where \(A > B\). Switching strategies depending on sd_diff was uncommon. When workers seem to have switched strategies, they appeared to be using a means_first heuristic or an ambiguous strategy at low levels of uncertainty and then switching to an outcome_proportion heuristic when sd_diff was high.

In the means_only condition, workers seem to be using a mean_diff_vs_axis heuristic more than half the time. However, we also see many workers guessing CLES values near 50% regardless of the stimulus condition, a pattern which is indistinguishable from the mean_diff_vs_axis heuristic at low levels of sd_diff. Only a couple workers seem to be basing their sense of what makes an effect reliable on the relative mean difference (compared to the maximum mean difference shown), rather than the range of the x-axis.

In the intervals_w_means condition, workers seem to switch strategies the most, with the exception of a couple workers who seemed to consistently rely on the interval_overlap and relative_mean_difference heuristics. Similar to the means_only condition, a subset of workers seem to guess CLES values near 50% regardless of the stimulus condition, a pattern which is indistinguishable from the mean_diff_vs_axis heuristic at low levels of sd_diff. Although my coding scheme did not account for changing strategies depending on the ground truth, it seems like this may be happening for some participants in this condition. Specifically, there are a few participants who overestimate small probabilities more than they underestimate large probabilities.

# heuristics relevant to each visualization condition (between subjects);
# any condition other than HOPs and means_only gets the intervals set
cles_heuristic_sets <- list(
  HOPs = c("ground_truth", "means_first_then_uncertainty_hops_est",
           "outcome_proportion_est", "outcome_proportion_10_est"),
  means_only = c("ground_truth", "rel_mean_diff_est", "mean_diff_vs_axis_est"),
  intervals = c("ground_truth", "rel_mean_diff_est", "mean_diff_vs_axis_est",
                "means_first_then_uncertainty_intervals_est",
                "interval_overlap_est", "interval_overlap_vs_axis_est")
)

# plot predictions w/ participant responses
for (worker in unique(combined_df$worker_id)) {
  # filter on worker
  worker_data <- combined_df %>% filter(worker_id == worker)

  # filter heuristics based on condition (between subjects)
  cond <- worker_data$condition[1]
  if (cond %in% "HOPs") {
    relevant_heuristics <- cles_heuristic_sets$HOPs
  } else if (cond %in% "means_only") {
    relevant_heuristics <- cles_heuristic_sets$means_only
  } else { # intervals
    relevant_heuristics <- cles_heuristic_sets$intervals
  }

  plt <- worker_data %>% filter(heuristic %in% relevant_heuristics) %>%
    ggplot(aes(x=ground_truth, y=est_cles, color=heuristic)) +
    geom_line() +
    geom_point(aes(x=ground_truth, y=cles), inherit.aes=FALSE, show.legend=FALSE) +
    colScale +
    theme_bw() +
    labs(
        caption=worker, # identify the worker on every plot (previously only the intervals branch did)
        x = "Ground Truth Pr(A > B)",
        y = "Estimated Pr(A > B)"
    ) +
    facet_grid(sd_diff ~ condition)
  print(plt)
}

# heuristics_df %>% ggplot(aes(x=odds_of_victory, y=est_cles, color=heuristic)) +
#   geom_line() +
#   geom_point(data=responses_df, aes(x=groundTruth, y=cles, alpha=0.3), inherit.aes=FALSE, show.legend=FALSE) +
#   colScale +
#   theme_bw() +
#   labs(title = "Heuristic Predictions vs Ground Truth w/ Worker Estimates of CLES",
#       x = "Ground Truth Pr(A > B)",
#       y = "Estimated Pr(A > B)"
#   ) +
#   facet_grid(sd_diff ~ condition)

Bet Amounts

Since the betting data is noisier than the CLES judgments, I did not tally strategies for these responses. By visual inspection of the plots below, my sense is that many participants tend to bet amounts either near 0, near 500, or near 1000 coins. Workers who make consistent bets despite varying probability of victory seem to break out of their pattern most often when odds are extreme (e.g., opting to bet more when they are sure they will win or bet less when they are sure they will lose). Some workers seem to employ more of an optimal betting strategy, mostly workers in the HOPs and intervals_w_means conditions, especially when sd_diff is large. For most workers, bets seem to show a curvilinear relationship with the ground truth rather than the optimal linear relationship. This brings to mind the possibility that a linear log odds model would account for betting behavior.

# heuristics relevant to each visualization condition (between subjects);
# any condition other than HOPs and means_only gets the intervals set
bet_heuristic_sets <- list(
  HOPs = c("ground_truth", "means_first_then_uncertainty_hops_est",
           "outcome_proportion_est", "outcome_proportion_10_est"),
  means_only = c("ground_truth", "rel_mean_diff_est", "mean_diff_vs_axis_est"),
  intervals = c("ground_truth", "rel_mean_diff_est", "mean_diff_vs_axis_est",
                "means_first_then_uncertainty_intervals_est",
                "interval_overlap_est", "interval_overlap_vs_axis_est")
)

# plot predictions w/ participant responses
for (worker in unique(combined_df$worker_id)) {
  # filter on worker
  worker_data <- combined_df %>% filter(worker_id == worker)

  # filter heuristics based on condition (between subjects)
  cond <- worker_data$condition[1]
  if (cond %in% "HOPs") {
    relevant_heuristics <- bet_heuristic_sets$HOPs
  } else if (cond %in% "means_only") {
    relevant_heuristics <- bet_heuristic_sets$means_only
  } else { # intervals
    relevant_heuristics <- bet_heuristic_sets$intervals
  }

  plt <- worker_data %>% filter(heuristic %in% relevant_heuristics) %>%
    ggplot(aes(x=ground_truth, y=est_bet, color=heuristic)) +
    geom_line() +
    geom_point(aes(x=ground_truth, y=bet), inherit.aes=FALSE, show.legend=FALSE) +
    colScale +
    theme_bw() +
    labs(
        caption=worker, # identify the worker on every plot for consistency with the CLES plots
        x = "Ground Truth Pr(A > B)",
        y = "Bet Amount"
    ) +
    facet_grid(sd_diff ~ condition)
  print(plt)
}

# heuristics_df %>% ggplot(aes(x=odds_of_victory, y=est_bet, color=heuristic)) +
#   geom_line() +
#   geom_point(data=responses_df, aes(x=groundTruth, y=bet, alpha=0.3), inherit.aes=FALSE, show.legend=FALSE) +
#   colScale +
#   theme_bw() +
#   labs(title = "Heuristic Predictions vs Ground Truth w/ Worker Bets",
#       x = "Ground Truth Pr(A > B)",
#       y = "Estimated Bet Amount"
#   ) +
#   facet_grid(sd_diff ~ condition)

Error Analysis

In this section, we look for patterns of interest in response errors.

# derive signed and absolute errors for both response types; for rows where
# heuristic == "ground_truth", these are errors relative to the true answer
combined_df <- combined_df %>% 
  mutate(
    err_cles = est_cles - cles,
    err_bet = est_bet - bet,
    abs_err_cles = abs(err_cles),
    abs_err_bet = abs(err_bet)
  )

Mean Error Per Visualization Condition

Collapsing across data conditions is a little reductive, but it is probably important to look at the overall pattern of absolute errors across visualization conditions.

On average, errors in CLES judgments are smallest in the HOPs condition.

# avg absolute error in CLES judgments per condition; filtering to the
# ground_truth "heuristic" keeps each response exactly once, with abs_err_cles
# measuring error relative to the true CLES
combined_df %>% 
  filter(heuristic %in% "ground_truth") %>%
  group_by(condition) %>%
  summarise(avg_abs_err_cles = mean(abs_err_cles)) %>%
  ggplot(aes(x=condition, y=avg_abs_err_cles, fill=condition)) +
    geom_col() + # idiomatic equivalent of geom_bar(stat="identity")
    theme_bw() +
    labs(title = "Average Absolute Error Relative to Ground Truth",
        x = "Visualization Condition",
        y = "Average Absolute Error"
    )

On average, errors in bet amounts seem relatively similar across visualization conditions.

# avg absolute error in bet amounts per condition; filtering to the
# ground_truth "heuristic" keeps each response exactly once, with abs_err_bet
# measuring error relative to the optimal bet
combined_df %>% 
  filter(heuristic %in% "ground_truth") %>%
  group_by(condition) %>%
  summarise(avg_abs_err_bet = mean(abs_err_bet)) %>%
  ggplot(aes(x=condition, y=avg_abs_err_bet, fill=condition)) +
    geom_col() + # idiomatic equivalent of geom_bar(stat="identity")
    theme_bw() +
    labs(title = "Average Absolute Error Relative to Optimal Bet",
        x = "Visualization Condition",
        y = "Average Absolute Error"
    )

Mean Error vs Ground Truth

Looking at the average signed errors in CLES estimates by condition, we can see that HOPs lead to less biased CLES judgments for extreme probabilities.

# average signed error in CLES judgments by ground truth, per condition
avg_cles_err_df <- combined_df %>%
  filter(heuristic %in% "ground_truth") %>%
  group_by(sd_diff, ground_truth, condition) %>%
  summarise(avg_err_cles = mean(err_cles))
avg_cles_err_df %>%
  ggplot(aes(x=ground_truth, y=avg_err_cles, color=condition)) +
    geom_line() +
    theme_bw() +
    labs(title = "Average Error Relative to Ground Truth",
        x = "Ground Truth Pr(A > B)",
        y = "Average Error"
    ) +
    facet_grid(sd_diff ~ .)

The average signed errors in bet amounts by condition show a more complex pattern. Note that there is a general bias toward betting too much except when the probability of winning is below 25%. This bias seems absent in the intervals_w_means condition when sd_diff is low, and this bias seems most consistent in the HOPs condition. However, it is hard to know whether these patterns are robust.

# average signed error in bet amounts by ground truth, per condition
avg_bet_err_df <- combined_df %>%
  filter(heuristic %in% "ground_truth") %>%
  group_by(sd_diff, ground_truth, condition) %>%
  summarise(avg_err_bet = mean(err_bet))
avg_bet_err_df %>%
  ggplot(aes(x=ground_truth, y=avg_err_bet, color=condition)) +
    geom_line() +
    theme_bw() +
    labs(title = "Average Error Relative to Optimal Bet",
        x = "Ground Truth Pr(A > B)",
        y = "Average Error"
    ) +
    facet_grid(sd_diff ~ .)

Check Bias Depending on Winner of Game

We want to know whether the probable winner of the game (i.e., whether ground truth CLES is greater than or less than 50%) has an impact on responses. This would show up as an asymmetry between errors depending on the winner of the game.

The chart of average signed errors for CLES judgments below shows such an asymmetry for the HOPs condition especially. In particular, HOPs seem less biased than other conditions, particularly when the ground truth CLES is less than 50%. However, it is hard to tell whether this relationship is robust.

# fold ground truth values below 50% onto the 50-100% range so errors can be
# compared by the probability of superiority for the probable winner
winner_cles_err_df <- combined_df %>%
  filter(heuristic %in% "ground_truth") %>%
  mutate(
    winner = ifelse(outcome == "True", "A", "B"),
    ground_truth_50_100 = ifelse(ground_truth < 0.5, 1 - ground_truth, ground_truth)
  ) %>%
  group_by(sd_diff, ground_truth_50_100, condition, winner) %>%
  summarise(avg_err_cles = mean(err_cles))
winner_cles_err_df %>%
  ggplot(aes(x=ground_truth_50_100, y=avg_err_cles, color=condition)) +
    geom_line(aes(linetype=winner)) +
    theme_bw() +
    labs(title = "Average Error in CLES Judgments Relative to Probability of Superiority for Winner",
        x = "Ground Truth Pr(Win)",
        y = "Average Error"
    ) +
    facet_grid(sd_diff ~ .)

As we’ve seen with other metrics, bet amounts show a more complex pattern. Again, we can see that HOPs seem to promote betting too much when the ground truth CLES is close to 50%. It is hard to tell whether the minor asymmetries in this plot are meaningful.

# fold ground truth values below 50% onto the 50-100% range so bet errors can
# be compared by the probability of superiority for the probable winner
winner_bet_err_df <- combined_df %>%
  filter(heuristic %in% "ground_truth") %>%
  mutate(
    winner = ifelse(outcome == "True", "A", "B"),
    ground_truth_50_100 = ifelse(ground_truth < 0.5, 1 - ground_truth, ground_truth)
  ) %>%
  group_by(sd_diff, ground_truth_50_100, condition, winner) %>%
  summarise(avg_err_bet = mean(err_bet))
winner_bet_err_df %>%
  ggplot(aes(x=ground_truth_50_100, y=avg_err_bet, color=condition)) +
    geom_line(aes(linetype=winner)) +
    theme_bw() +
    labs(title = "Average Error in Bet Amounts Relative to Probability of Superiority for Winner",
        x = "Ground Truth Pr(Win)",
        y = "Average Error"
    ) +
    facet_grid(sd_diff ~ .)

Model Building

We start out with a simple non-hierarchical model for the probability of one heuristic vs ground truth in one visualization condition, HOPs. Then we model the probability of more than one alternative heuristic. Then we’ll add hierarchy to that model. Finally, we’ll add in the other visualization conditions.

Prepare the Data Frame for Modeling in Stan

First, we need the data in a format where it is prepared for modeling in Stan. We will calculate the heuristic predictions in Stan, so we just need to get the stimuli-generating data and the worker response data in a single dataframe with one row per worker * trial.

# create data frame for model by merging stimuli-generating data with responses
# (one row per worker * trial)
model_df <- stimuli_data_df %>%
  mutate( # create rounded version of ground_truth to merge on, leaving unrounded value stored in odds_of_victory
    ground_truth=round(odds_of_victory, 3)
  ) %>%
  full_join(responses_df, by=c("worker_id", "sd_diff", "ground_truth")) %>%
  rename( # rename ground_truth columns, so it is clear which is rounded and which should be used in the model
    ground_truth_rounded=ground_truth,
    ground_truth=odds_of_victory
  )

Non-Hierarchical Model of the Probability of the Outcome Proportion Heuristic vs Ground Truth in the HOPs Condition

We start with the simplest possible version of the model we’d like to build. Basically, we are modeling the probability that a worker in the HOPs condition will use the outcome proportion heuristic vs the ground truth. This model does not differentiate between workers (no hierarchy, i.e., random effect of worker), focuses on one visualization condition, and ignores all but two possible heuristics. It is a first step toward a much more sophisticated model.

Prepare Data List for Modeling in Stan

Include only the HOPs condition.

# filter to the HOPs visualization condition only
model_df_hops <- model_df %>% filter(condition=="HOPs")

# create data list in the format expected by the Stan model
data_hops_one <- list(
  n=length(model_df_hops$trial),                  # total observations (i.e., trials)
  cles=model_df_hops$cles,                        # the worker's response on each trial
  ground_truth=model_df_hops$ground_truth,        # the ground truth cles value on each trial
  n_draws=50,                                     # number of draws displayed in the HOPs per trial
  # draws=model_df_hops$draws_diff                  # multidimensional array of the difference between draws B - A shown in HOPs trials * draws
  draws=do.call("rbind",model_df_hops$draws_diff) # trials * draws matrix of the difference between draws B - A shown in HOPs
)

Compile and Run the Model

The model is in the file “stan/hops-only_one-heuristic.stan”. We’ll compile the stan code, convert to a model object, and then fit the model.

# fit the compiled Stan model to the HOPs data; adapt_delta is raised to
# reduce divergent transitions
m.hops.one_heuristic <- sampling(
  stan_model,
  data = data_hops_one,
  control = list(adapt_delta = 0.99),
  chains = 2, # was `chain=2`, which only worked via partial argument matching; the rstan argument is `chains`
  cores = 2
)

Check diagnostics:

  • Trace plots
# trace plots: the two chains should overlap and look like stationary
# noise if sampling mixed well
traceplot(m.hops.one_heuristic)

  • Pairs plot. These look a little skewed, but that might be just because sigma is bounded.
# bivariate posterior scatterplots for the two parameters; useful for
# spotting correlations or funnel shapes between p_heuristic and sigma
pairs(m.hops.one_heuristic, pars = c('p_heuristic', 'sigma'))

  • Summary
# posterior summary: means, quantiles, n_eff, and Rhat per parameter
print(m.hops.one_heuristic)
## Inference for Stan model: hops-only_one-heuristic.
## 2 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=2000.
## 
##              mean se_mean   sd  2.5%   25%   50%   75% 97.5% n_eff Rhat
## p_heuristic  0.50    0.01 0.19  0.14  0.35  0.50  0.64  0.87   998    1
## sigma        0.54    0.01 0.43  0.10  0.23  0.39  0.72  1.69   947    1
## lp__        -2.38    0.04 1.02 -5.21 -2.74 -2.05 -1.66 -1.43   684    1
## 
## Samples were drawn using NUTS(diag_e) at Wed Feb 27 14:21:03 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

The model has learned that users seem fairly evenly split between the outcome proportion heuristic and the ground truth when we only allow for those two strategies. The high variability of this estimate suggests that there is a lot of information which is not accounted for by this model (which we would expect since this binary choice of strategy probably does not represent the true data-generating process).

Non-Hierarchical Model of the Probability of Multiple Heuristics in the HOPs Condition

Let’s step up the complexity of the model just a little bit by adding in the possibility of additional heuristics. Now, we’ll consider as alternative heuristics: the ground truth, outcome proportion, outcome proportion for only the first ten trials, and an ensemble means/spread heuristic.

Still some important things are missing from this model. The model does not differentiate between workers and focuses on just one visualization condition. We will add in these additional considerations in later iterations.

Prepare Data List for Modeling in Stan

# assemble the Stan inputs for the multi-heuristic model: the same data as
# the one-heuristic model, plus a count of the alternative heuristics
data_hops_multi <- list(
  n            = length(model_df_hops$trial),             # total observations (i.e., trials)
  cles         = model_df_hops$cles,                      # the worker's response on each trial
  ground_truth = model_df_hops$ground_truth,              # the ground truth cles value on each trial
  n_heuristic  = 4,                                       # the number of alternative heuristics we model
  n_draws      = 50,                                      # number of draws displayed in the HOPs per trial
  draws        = do.call(rbind, model_df_hops$draws_diff) # trials * draws matrix of the difference between draws B - A shown in HOPs
)

Compile and Run the Model

The model is in the file “stan/hops-only_multi-heuristic.stan”. We’ll compile the stan code, convert to a model object, and then fit the model.

# fit the compiled multi-heuristic Stan model to the HOPs data
m.hops.multi_heuristic <- sampling(
  stan_model,
  data = data_hops_multi,
  control = list(adapt_delta = 0.99),
  chains = 2, # was `chain=2`, which only worked via partial argument matching; the rstan argument is `chains`
  cores = 2
)

Check diagnostics:

  • Trace plots
# trace plots for all parameters of the multi-heuristic model
traceplot(m.hops.multi_heuristic)

  • Pairs plot. These look a little skewed again perhaps because sigma and p_heuristic are bounded. The strong correlation between mu_lo_heuristic and p_heuristic is expected since p_heuristic = softmax(mu_lo_heuristic).
# pairs plot of the heuristic weights (log odds scale), sigma, and the
# implied probabilities; since p_heuristic = softmax(mu_lo_heuristic),
# strong correlation between those panels is expected
pairs(m.hops.multi_heuristic, pars = c('mu_lo_heuristic','sigma','p_heuristic'))

  • Summary
# posterior summary: means, quantiles, n_eff, and Rhat per parameter
print(m.hops.multi_heuristic)
## Inference for Stan model: hops-only_multi-heuristic.
## 2 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=2000.
## 
##                     mean se_mean   sd  2.5%   25%   50%   75% 97.5% n_eff
## mu_lo_heuristic[1]  0.00    0.02 0.79 -1.54 -0.53 -0.02  0.52  1.61  2000
## mu_lo_heuristic[2]  0.01    0.02 0.81 -1.66 -0.53  0.04  0.56  1.49  1876
## mu_lo_heuristic[3]  0.00    0.02 0.82 -1.59 -0.57  0.02  0.57  1.51  1612
## mu_lo_heuristic[4] -0.02    0.02 0.83 -1.70 -0.58 -0.02  0.52  1.57  1678
## sigma               0.33    0.01 0.21  0.12  0.19  0.26  0.39  0.90  1061
## p_heuristic[1]      0.25    0.00 0.14  0.05  0.14  0.22  0.33  0.58  2532
## p_heuristic[2]      0.25    0.00 0.14  0.05  0.14  0.23  0.35  0.57  2341
## p_heuristic[3]      0.25    0.00 0.14  0.05  0.14  0.22  0.34  0.59  2249
## p_heuristic[4]      0.25    0.00 0.14  0.04  0.13  0.22  0.34  0.58  2428
## lp__               -5.75    0.05 1.56 -9.82 -6.51 -5.44 -4.62 -3.66   944
##                    Rhat
## mu_lo_heuristic[1]    1
## mu_lo_heuristic[2]    1
## mu_lo_heuristic[3]    1
## mu_lo_heuristic[4]    1
## sigma                 1
## p_heuristic[1]        1
## p_heuristic[2]        1
## p_heuristic[3]        1
## p_heuristic[4]        1
## lp__                  1
## 
## Samples were drawn using NUTS(diag_e) at Wed Feb 27 14:21:30 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

Again, the model seems to have learned a fairly even split between heuristics such that the mean of the posteriors for p_heuristic ~= 1 / n_heuristic. Note that these estimates are pretty uncertain.

Hierarchical Model of the Probability of Multiple Heuristics in the HOPs Condition

Hierarchical Model of the Probability of Multiple Heuristics for HOPs, Means Only, & Intervals with Means

Linear Log Odds Model (Plan B)

To get a sense of what we can infer from the data, we fit a linear model of CLES judgments vs the ground truth in log odds units. This model follows from related work suggesting that the human perception of probability is encoded on a log odds scale. On this scale, the slope of a linear model represents the shape and severity of the function describing bias in probability perception. The greater the deviation from a slope of 1 (i.e., ideal performance), the more biased the judgments of probability. Slopes less than one correspond to the kind of bias predicted by excessive attention to the mean. On the same log odds scale, the intercept is a crossover-point which should be proportional to the number of categories of possible outcomes among which probability is divided. In our case, the intercept should be about 0.5 since workers are judging the probability of team A vs team B winning a round of a simulated game.

Prepare Data for Modeling

Convert response and ground truth units to log odds.

# # convert p to log odds (superseded by stats::qlogis below)
# # NOTE(review): the original hand-rolled version computed log(p) / log(1 - p),
# # which is NOT the logit; the logit is log(p / (1 - p)) = qlogis(p)
# logit <- function(p) {
#   return(log(p / (1 - p)))
# }
# # convert log odds to p (superseded by stats::plogis below)
# inv_logit <- function(lo) {
#   return(exp(lo) / (1 + exp(lo)))
# }

# put responses and ground truth on the log odds scale for the LLO model;
# cles is recorded in percent (0-100), hence the division by 100
model_df_llo <- model_df %>%
  mutate(
    # lo_cles=logit(cles / 100),
    # lo_ground_truth=logit(ground_truth)
    lo_cles=qlogis(cles / 100),
    lo_ground_truth=qlogis(ground_truth)
  )

Create and Run the Model in BRMs

We’ll use the brms package for this model since it is relatively more orthodox than our models of the probability of alternative heuristics, and we do not really need to customize our likelihood function.

Distribution of CLES

We start as simple as possible by just modeling the distribution of CLES on the log odds scale.

Let’s check that our priors seem reasonable.

# prior predictive check: simulate cles values implied by the priors alone.
# sigma must be positive (brms truncates the normal(0, 1) prior on sigma at
# zero), so take abs() of the normal draws -- a half-normal. The original
# passed raw normal draws as sd, which produced NAs for every negative draw
# (the "NAs produced" warning).
n <- 1e4
tibble(sample_mu    = rnorm(n, mean = 0, sd = 1),
       sample_sigma = abs(rnorm(n, mean = 0, sd = 1))) %>% 
  mutate(lo_cles = rnorm(n, mean = sample_mu, sd = sample_sigma),
         cles = plogis(lo_cles)) %>% 
  ggplot(aes(x = cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = expression(paste("Prior predictive distribution for ", italic(h[i]))),
       x = NULL) + # `lo_cles = NULL` was not a labs() argument; x is the mapped aesthetic
  theme(panel.grid = element_blank())
## Warning in rnorm(n, mean = sample_mu, sd = sample_sigma): NAs produced
## Warning: Removed 5039 rows containing non-finite values (stat_density).

Fit an intercept model.

# starting as simple as possible: learn the distribution of lo_cles with an
# intercept-only Gaussian model of the logit-transformed responses;
# weakly informative normal(0, 1) priors on both the intercept and sigma
m.lo_cles <- brm(data = model_df_llo, family = gaussian,
              lo_cles ~ 1,
              prior = c(prior(normal(0, 1), class = Intercept),
                        prior(normal(0, 1), class = sigma)),
              iter = 3000, warmup = 500, chains = 2, cores = 2)
## Compiling the C++ model
## Start sampling

Check diagnostics:

  • Trace plots. The value of sigma seems pretty large.
# trace plots plus marginal posterior densities for each parameter
plot(m.lo_cles)

  • Pairs plot. These look great.
# bivariate posterior scatterplots; checks for correlations between the
# intercept and sigma
pairs(m.lo_cles)

  • Summary
# posterior summary: estimates, credible intervals, Eff.Sample, and Rhat
print(m.lo_cles)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: lo_cles ~ 1 
##    Data: model_df_llo (Number of observations: 780) 
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
##          total post-warmup samples = 5000
## 
## Population-Level Effects: 
##           Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept     0.03      0.05    -0.07     0.13       4900 1.00
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma     1.50      0.04     1.43     1.57       4740 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).
  • Posterior draws
# get posterior draws
post <- posterior_samples(m.lo_cles)

# get summary stats on distribution of each parameter;
# pivot_longer() replaces the superseded gather()
post %>% 
  select(-lp__) %>% # drop log probability of posterior draws
  pivot_longer(everything(), names_to = "parameter") %>%
  group_by(parameter) %>%
  mean_qi(value)
## # A tibble: 2 x 7
##   parameter    value  .lower .upper .width .point .interval
##   <chr>        <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 b_Intercept 0.0274 -0.0742  0.128   0.95 mean   qi       
## 2 sigma       1.50    1.43    1.57    0.95 mean   qi
  • Posterior predictive distribution. This isn’t working because exp(post_lo_cles) => inf.
# posterior predictive check: draw predicted responses from the fitted
# model and plot them back on the probability (CLES) scale
model_df_llo %>%
  select(lo_ground_truth) %>% # intercept-only model; the predictor is not used
  add_predicted_draws(m.lo_cles, prediction = "lo_cles", seed = 1234) %>% # get draws from fitted model
  mutate(post_cles = plogis(lo_cles)) %>% # back-transform from log odds
  ggplot(aes(x=post_cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior predictive distribution for CLES",
       x = NULL) + # `post_cles = NULL` was not a labs() argument; x is the mapped aesthetic
  theme(panel.grid = element_blank())

Linear Log Odds Model of CLES

Now we'll add in a slope parameter. Hopefully, this will reduce the residual variance.

Let’s check that our priors seem reasonable.

# prior predictive check for the model with a slope. As above, sigma must be
# positive (brms truncates its prior at zero), so use half-normal draws;
# raw normal draws as sd produced the "NAs produced" warning.
n <- 1e4
tibble(intercept    = rnorm(n, mean = 0, sd = 1),
       slope        = rnorm(n, mean = 0, sd = 1),
       sample_sigma = abs(rnorm(n, mean = 0, sd = 1)),
       lo_ground_truth = list(qlogis(unique(stimuli_data_df$odds_of_victory)))) %>% 
  unnest(lo_ground_truth) %>% # name the list-column explicitly rather than bare unnest()
  mutate(lo_cles = rnorm(n * length(unique(stimuli_data_df$odds_of_victory)), mean = intercept + slope * lo_ground_truth, sd = sample_sigma),
         cles = plogis(lo_cles)) %>% 
  ggplot(aes(x = cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = expression(paste("Prior predictive distribution for ", italic(h[i]))),
       x = NULL) + # `lo_cles = NULL` was not a labs() argument; x is the mapped aesthetic
  theme(panel.grid = element_blank())
## Warning in rnorm(n * length(unique(stimuli_data_df$odds_of_victory)), mean
## = intercept + : NAs produced
## Warning: Removed 49820 rows containing non-finite values (stat_density).

What we get using generic weakly informative priors seems reasonable given that we are sampling more at extreme levels of ground truth than values in the middle of the scale.

Now, let’s fit the LLO model.

# linear log odds model: lo_cles ~ 1 + lo_ground_truth
# the slope measures distortion in probability perception (1 = unbiased;
# < 1 = judgments compressed toward the crossover point); weakly
# informative normal(0, 1) priors on intercept, slope, and sigma
m.llo_cles <- brm(data = model_df_llo, family = gaussian,
              lo_cles ~ 1 + lo_ground_truth,
              prior = c(prior(normal(0, 1), class = Intercept),
                        prior(normal(0, 1), class = b),
                        prior(normal(0, 1), class = sigma)),
              iter = 3000, warmup = 500, chains = 2, cores = 2)
## Compiling the C++ model
## Start sampling

Check diagnostics:

  • Trace plots
# trace plots plus marginal posterior densities for each parameter
plot(m.llo_cles)

  • Pairs plot. It looks like we might have an issue with multicollinearity because b_Intercept and b_lo_ground_truth are so highly correlated.
# bivariate posterior scatterplots; checks for correlations among the
# intercept, slope, and sigma
pairs(m.llo_cles)

  • Summary
# posterior summary: estimates, credible intervals, Eff.Sample, and Rhat
print(m.llo_cles)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: lo_cles ~ 1 + lo_ground_truth 
##    Data: model_df_llo (Number of observations: 780) 
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
##          total post-warmup samples = 5000
## 
## Population-Level Effects: 
##                 Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept           0.03      0.05    -0.07     0.12       4323 1.00
## lo_ground_truth     0.30      0.02     0.26     0.34       3960 1.00
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma     1.32      0.03     1.26     1.39       5371 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).
  • Posterior draws
# get posterior draws
post <- posterior_samples(m.llo_cles)

# get summary stats on distribution of each parameter;
# pivot_longer() replaces the superseded gather()
post %>% 
  select(-lp__) %>% # drop log probability of posterior draws
  pivot_longer(everything(), names_to = "parameter") %>%
  group_by(parameter) %>%
  mean_qi(value)
## # A tibble: 3 x 7
##   parameter          value  .lower .upper .width .point .interval
##   <chr>              <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 b_Intercept       0.0263 -0.0658  0.118   0.95 mean   qi       
## 2 b_lo_ground_truth 0.302   0.264   0.340   0.95 mean   qi       
## 3 sigma             1.32    1.26    1.39    0.95 mean   qi
  • Posterior predictive distribution.
# posterior predictive check: draw predicted responses at the observed
# predictor values and plot them back on the probability (CLES) scale
model_df_llo %>%
  select(lo_ground_truth) %>%
  add_predicted_draws(m.llo_cles, prediction = "lo_cles", seed = 1234) %>%
  mutate(post_cles = plogis(lo_cles)) %>% # back-transform from log odds
  ggplot(aes(x=post_cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Posterior predictive distribution for CLES",
       x = NULL) + # `post_cles = NULL` was not a labs() argument; x is the mapped aesthetic
  theme(panel.grid = element_blank())

# post %>% # It doesn't seem right that this is so asymmetrical.
#   mutate( # add sampled values of lo_ground_truth to table
#     lo_ground_truth = list(qlogis(unique(stimuli_data_df$odds_of_victory)))
#   ) %>% 
#   unnest() %>%
#   mutate( # run posterior samples through model
#     mu = b_Intercept + b_lo_ground_truth * lo_ground_truth,
#     post_lo_cles = rnorm(n(), mean = mu, sd = sigma), # NOTE(review): the original rnorm(mu, sigma) passed mu as the count argument (rnorm's signature is rnorm(n, mean, sd)), which likely explains the asymmetry noted above
#     post_cles = plogis(post_lo_cles)
#   ) %>%
#   ggplot(aes(x=post_cles)) +
#   geom_density(fill = "black", size = 0) +
#   scale_y_continuous(NULL, breaks = NULL) +
#   labs(subtitle = "Posterior predictive distribution for CLES",
#        post_cles = NULL) +
#   theme(panel.grid = element_blank())

How does the posterior predictive compare to the observed data? The posterior predictive seems too wide.

# observed data density, for comparison with the posterior predictive above
model_df_llo %>%
  ggplot(aes(x=cles)) +
  geom_density(fill = "black", size = 0) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = "Data distribution for CLES",
       x = NULL) + # `cles = NULL` was not a labs() argument; x is the mapped aesthetic
  theme(panel.grid = element_blank())

Let’s take a look at some of the estimated linear models.

# plot a sample of 20 posterior draws of the fitted line against the data.
# index posterior columns by name rather than position so the plot does not
# silently break if posterior_samples() changes its column order.
model_df_llo %>%
ggplot(aes(x = lo_ground_truth, y = lo_cles)) +
  geom_abline(intercept = post$b_Intercept[1:20],
              slope     = post$b_lo_ground_truth[1:20],
              size = 1/3, alpha = .3) +
  geom_point(shape = 1, size = 2, color = "royalblue") +
  coord_cartesian(xlim = quantile(model_df_llo$lo_ground_truth, c(0, 1)),
                  ylim = quantile(model_df_llo$lo_cles, c(0, 1))) +
  theme_bw() +
  theme(panel.grid = element_blank())